Aim

Explore how spillover and compensation affect correlations and phenograph clusters in a cyTOF dataset. This will reproduce, among other plots, the following figures from the paper “Compensation of signal spillover in suspension and imaging mass cytometry”: * Figure 3, A-E * Figure S4, A-C

Approach

A 36-antibody panel was used to detect the main immune cell populations in PBMCs (Paper Table S1). This panel was not optimized to avoid spillover effects and contained identical antibodies in different mass channels to facilitate the identification of spillover artifacts. The spillover artefacts will be investigated by using a compensation matrix estimated by stingle stained bead experiments at the same day as the sample.

Preprocessing

Load the required libraries

options(knitr.root.dir = normalizePath('../'))
library(flowCore)
library(Rtsne)
library(Rphenograph)

library(CATALYST)

library(dplyr)
library(data.table)
library(dtplyr)


library(RColorBrewer)
library(gplots)
#library(Scale)

#optimal leaf ordering of hierarchical clustering:
library(cba)
library(Hmisc)
library(mclust)

Define some helper functions

flowFrame2dt <- function (datFCS){
  # converts a flow frame to a data table
  dt = flowCore::exprs(datFCS)
  colnames(dt) = make.names(make.unique(colnames(dt)))
  fil = !is.na(datFCS@parameters@data$desc)
  colnames(dt[, fil]) <- datFCS@parameters@data$desc[fil]
  dt <- data.table::data.table(dt)
  uninames = make.names(make.unique(datFCS@parameters@data$name))
  uni_newnames = make.names(make.unique(datFCS@parameters@data$desc))
  data.table::setnames(dt, uninames[fil], uni_newnames[fil])
  return(dt)
}

censor_dat <- function (x, quant = 0.999){
  # censors the upper limit vector's value using the provided quantile
  q = quantile(x, quant)
  x[x > q] = q
  return(x)
}

Set the configuration variables

# the FCS file used
fn_cells = '../data/Figure_2-3/Bead_PBMC_staining/Exp2/160805_Exp3_cells-only.fcs'
name_condition = 'Exp3'

# the file containing the spillover matirx generated by CATALYST 
fn_sm = c('../data/Figure_2-3/Bead_PBMC_staining/Exp2/160805_Exp3_spillover_matrix.csv')

folder_out = '../data'
do_load = T

# subsample cellnumber
nsamp = 20000

transf = function(x) asinh(x/5)

# defines some channel names that should not be considered
bad_channels = c(  "CD3"   ,  "CD45","SampleID","Time", "Event_length", "MCB102", "MCB104", "MCB105", "MCB106",
                   "MCB108","MCB110","MCB113","MCB115" ,"Beads140" , "File.Number",
                   "DNA193",    "DNA191" ,   "Live194",      "Live195" ,     "beadDist",     "barcode",
                   "89Y"  ,  "102Pd_BC1" , "104Pd_BC2" ,"105Pd_BC3" ,"106Pd_BC4","108Pd_BC5", "110Pd_BC6" ,    
                   "113In_BC7", "115In_BC8", "138Ba","140Ce", "191Ir_DNA1" , 
                   "193Ir_DNA2","195Pt","196Pt","208Pb", "Center","Offset","Width",          
                   "Residual" ,  "102Pd"  , "103Rh" , "104Pd" , "105Pd" , "106Pd","108Pd",
                   "110Pd", "113In","115In" ,"157Gd" ,"155Gd-CD273",'File-Number'
)

col_list  <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72", 
               "#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3", 
               "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D", 
               "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999", 
               "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000", 
               "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")

Load the spillover matrixes generated and saved by CATALYST

dat_sms = lapply(fn_sm, function(fn) read.csv(fn, row.names = 1))
names(dat_sms) = sapply(fn_sm, function(fn) basename(fn))

Read the data as a flow frame

ff <- flowCore::read.FCS(fn_cells)
## the text section does not end with delimiter: \\. The last keyword is dropped.
## the text section does not end with delimiter: \\. The last keyword is dropped.

Subsamples to speed the calculations up

set.seed(1234)
if (!is.na(nsamp)){
  ff = ff[sample(nrow(ff), nsamp)]
}

Compensate the data

es <- flowCore::exprs(ff)

dats_compmeta = data.table(compensation= c('raw',paste(names(dat_sms), '_clas'), paste(names(dat_sms),'_nnls')))
dats_compmeta[, is_raw := compensation == 'raw']
ff_list = list()
ff_list[[1]] = ff

for (i in seq_along(dat_sms)){
  tsm = dat_sms[[i]]
  # helps with numerical instabilities sometimes observed during saving/loading
  tsm = round(tsm, digits=8)
  ff_list[[i+1]] = CATALYST::compCytof(ff, tsm)
}

for (i in seq_along(dat_sms)){
  tsm = dat_sms[[i]]
  # There seem some numerical instabilities when loading the data (e.g.  -5.596894e-33). 
  # rounding fixes this issue.
  tsm = round(tsm, digits=8)
  ff_list[[i+(length(dat_sms)+1)]] = CATALYST::compCytof(ff, tsm, method='nnls')
}

names(ff_list) <- dats_compmeta$compensation

Tidy the data into data.tables

This is not necessary, but personal preference

tidy_ff <- function(x, nm_condition){
  x = flowFrame2dt(x)
  x[, id:=paste(as.character(1:.N), nm_condition)]
  x =  melt.data.table(x, id.vars = 'id',variable.name = 'channel', value.name='counts', variable.factor = F)
  x[, counts_transf:= transf(counts)]
  
  setkey(x, id)
  
  return(x)
}

dats= lapply(ff_list, function(x) tidy_ff(x, name_condition))

Clean up some channel names

clean_channels = function(x){
  x[, channel :=as.character(gsub('X','',as.character(channel))) , by=channel]
  x[, channel :=gsub('\\.','-',as.character(.BY)) , by=channel]
  x[, channel :=gsub(' ','-',as.character(.BY)) , by=channel]
  return(x)
}
dats= lapply(dats, clean_channels)

Create a metadata table

# for additional, global cell specific information, e.g. condition
datcell = dats[['raw']] %>%
  dplyr::select(-c(counts, counts_transf, channel)) %>%
  dplyr::filter(!duplicated(id)) %>%
  mutate(condition= name_condition)
setkey(datcell, id)

datmeta =  data.table(name=ff@parameters$name, channel=ff@parameters$desc)
datmeta = clean_channels(datmeta)
setkey(datmeta, channel)

good_channels =unique(datmeta$channel)[!unique(datmeta$channel) %in% bad_channels]

Add information which channels contain antibodies against the same target

datmeta[,antibody:=gsub('.....-','',channel)]

#datmeta[antibody=='CD8b',antibody:='CD8']
datmeta[,n_chan := length(unique(channel)), by=antibody]
datmeta[,metal:=substr(channel, 4, 5)]
datmeta[metal =='', metal:= as.character(1:.N)]

Clustering and dimensionality reduction using TSNE

Cluster the data using phenograph

# adapted from https://github.com/lmweber/FlowSOM-Rtsne-example/blob/master/FlowSOM_Rtsne_example.R

do_phenograph<- function(data, channels, valuevar= 'counts_transf', seed=1234, subsample=FALSE){
  # A helper function
  
  pheno_dat = data.table::dcast.data.table(data[channel %in% channels, ],id~channel,value.var = valuevar)
  all_ids = pheno_dat$id
  
  if (subsample == FALSE){
    subsample=1
  } 
  
  
  
  sampids = pheno_dat[, sample(id, floor(.N*subsample),replace = F)]
  pheno_dat_samp = pheno_dat[id %in%sampids, ]
  ids =pheno_dat_samp$id
  pheno_dat_samp[, id:=NULL]
  set.seed(seed)
  rpheno_out = Rphenograph::Rphenograph(pheno_dat_samp)
  cluster = igraph::membership(rpheno_out[[2]])
  pheno_clust = data.table::data.table(x=ids)
  setnames(pheno_clust, 'x', 'id')
  pheno_clust[, cluster:=factor(cluster[as.character(seq(length(ids)))])]
  data.table::setkey(pheno_clust, 'id')
  pheno_clust = pheno_clust[all_ids ,]
  
  return(pheno_clust)
}
datspheno= lapply(dats, function(dat) do_phenograph(dat, good_channels, valuevar = 'counts_transf', seed=1234))
## Run Rphenograph starts:
##   -Input data of 20000 rows and 35 columns
##   -k is set to 30
##   Finding nearest neighbors...DONE ~ 8.943 s
##   Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 4.678 s
##   Build undirected graph from the weighted links...DONE ~ 2.504 s
##   Run louvain clustering on the graph ...DONE ~ 3.949 s
## Run Rphenograph DONE, totally takes 20.074s.
##   Return a community class
##   -Modularity value: 0.8855295 
##   -Number of clusters: 20
## Run Rphenograph starts:
##   -Input data of 20000 rows and 35 columns
##   -k is set to 30
##   Finding nearest neighbors...DONE ~ 12.066 s
##   Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 5.145 s
##   Build undirected graph from the weighted links...DONE ~ 2.692 s
##   Run louvain clustering on the graph ...DONE ~ 4.467 s
## Run Rphenograph DONE, totally takes 24.37s.
##   Return a community class
##   -Modularity value: 0.8796457 
##   -Number of clusters: 18
## Run Rphenograph starts:
##   -Input data of 20000 rows and 35 columns
##   -k is set to 30
##   Finding nearest neighbors...DONE ~ 7.587 s
##   Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 5.14 s
##   Build undirected graph from the weighted links...DONE ~ 2.523 s
##   Run louvain clustering on the graph ...DONE ~ 3.795 s
## Run Rphenograph DONE, totally takes 19.045s.
##   Return a community class
##   -Modularity value: 0.8882629 
##   -Number of clusters: 22

calculate tsne maps

calc_tsne <- function (input_dat, channels, value_var = "counts", channel_var = "channel", 
                       id_var = "id",  scale = F, verbose = T, dims = 2, seed=1234,...){
  set.seed(seed)
  ids = input_dat[!duplicated(get(id_var)), get(id_var)]
  dt = data.table::dcast(input_dat[(get(channel_var) %in% 
                                      channels) & get(id_var) %in% ids], formula = as.formula(paste(id_var, 
                                                                                                    "~", channel_var)), value.var = value_var)
  if (scale) {
    tsnedat = scale(dt[, channels, with = F])
  }
  else {
    tsnedat = dt[, channels, with = F]
  }
  tsne_out <- Rtsne::Rtsne(tsnedat, verbose = verbose, dims = dims, 
                           ...)
  tsne_out$Y = data.table(tsne_out$Y)
  setnames(tsne_out$Y, names(tsne_out$Y), paste("bh", names(tsne_out$Y), 
                                                sep = "_"))
  tsne_out$Y$id = dt[, get(id_var)]
  data.table::setnames(tsne_out$Y, "id", id_var)
  data.table::setkeyv(tsne_out$Y, id_var)
  tsne_out$channels = channels
  tsne_out$scale = F
  return(tsne_out)
}

# Setup some colors for plotting
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

The TSNE map is calculated on the uncompensated values

ttsne =  calc_tsne(dats[['raw']], good_channels, value_var = 'counts_transf', channel_var = "channel")
## Read the 20000 x 35 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 20000
##  - point 10000 of 20000
## Done in 10.54 seconds (sparsity = 0.006750)!
## Learning embedding...
## Iteration 50: error is 104.475015 (50 iterations in 52.30 seconds)
## Iteration 100: error is 101.438325 (50 iterations in 88.19 seconds)
## Iteration 150: error is 87.019123 (50 iterations in 38.28 seconds)
## Iteration 200: error is 85.054767 (50 iterations in 50.02 seconds)
## Iteration 250: error is 84.248226 (50 iterations in 39.69 seconds)
## Iteration 300: error is 3.666333 (50 iterations in 37.49 seconds)
## Iteration 350: error is 3.412136 (50 iterations in 32.91 seconds)
## Iteration 400: error is 3.238534 (50 iterations in 29.31 seconds)
## Iteration 450: error is 3.116084 (50 iterations in 29.25 seconds)
## Iteration 500: error is 3.023403 (50 iterations in 38.20 seconds)
## Iteration 550: error is 2.950851 (50 iterations in 32.21 seconds)
## Iteration 600: error is 2.892485 (50 iterations in 29.52 seconds)
## Iteration 650: error is 2.843872 (50 iterations in 31.64 seconds)
## Iteration 700: error is 2.802840 (50 iterations in 34.05 seconds)
## Iteration 750: error is 2.767604 (50 iterations in 33.21 seconds)
## Iteration 800: error is 2.737041 (50 iterations in 29.35 seconds)
## Iteration 850: error is 2.710190 (50 iterations in 28.28 seconds)
## Iteration 900: error is 2.686808 (50 iterations in 33.81 seconds)
## Iteration 950: error is 2.666254 (50 iterations in 30.27 seconds)
## Iteration 1000: error is 2.648096 (50 iterations in 32.35 seconds)
## Fitting performed in 750.33 seconds.

The phenograph clusters are plotted on the TSNE map

This corresponds to figure S4C

tclust = datspheno[['raw']]
tdat = dats[['raw']]

p = ggplot(subset(tdat, !duplicated(id))[ttsne$Y][tclust], aes(x=bh_V1, y=bh_V2))+
  geom_point(size=0.3, alpha=1, aes(color=as.factor(cluster)))+
  scale_color_manual(values = col_list)+
  ggtitle('Figure S4C')+
  guides(color=guide_legend(override.aes=list(size=5)))+
  theme(strip.background = element_blank(),
        panel.background=element_rect(fill='white', colour = 'black'),
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        plot.background=element_blank(),
        legend.key = element_blank())
p

ggsave(filename = file.path(folder_out,  'figS4c_phenograph-on-tsne.pdf'),plot=p,width=10, height=5)

select phenograph clustering to be used for all downstream analysis

The phenograph based on the uncompensated data is used for all future plots

clusterdat = datspheno[['raw']]

Map the markers on the tsne map

In the following plots the asinh(x/5) transformed intensities were normalized between 0-1 by using the 0.99 percentile of the data. Based on these plots the Fig 3A as well as Fig S4A-B were assembled.

Note that also the classical compensation is plotted, but due to a fixed color scheme not allowing for negatives, it does not get plotted correctly.

ptsne = ttsne
pdats = dats

for (tdatname in names(pdats)){
  tdat = pdats[[tdatname]]
  tdat[, c_counts := censor_dat(counts_transf,0.99), by=channel]
  tdat[, c_counts_scaled := c_counts, by=channel]
  tdat[, c_counts_scaled := (c_counts_scaled/max(c_counts_scaled)), by=channel]
  p = ggplot(subset(tdat, channel %in% good_channels)[ptsne$Y],
             aes(x=bh_V1, y=bh_V2, color=c_counts_scaled))+
    facet_wrap(~channel, scales = "free", ncol = 9)+
    geom_point(alpha=0.5, size=0.3)+
    scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts', limits=c(0, 1))+
    ggtitle(tdatname)+
    theme(strip.background = element_blank(),
          strip.text.x = element_text(size = 11),
          axis.line=element_blank(),
          axis.text.x=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          panel.background=element_blank(),
          panel.border=element_blank(),
          panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),
          plot.background=element_blank()) 
  print(p)
  #ggsave(filename = file.path(folder_out, paste(tdatname, 'tsne_all_markers.png',sep='_')), plot = p,width=15, height=6, dpi = 300)
  
}

Plot heatmaps of the phenograph cluster statistics

The chunck below defines some convenience functions

get_summarydat <- function(ssdat, clusterdat, valuevar){
  # calculates summary statistics per cluster
  summary_dat = ssdat[clusterdat][ ,list(
    median_val = median(get(valuevar), na.rm=T),
    mean_val = median(get(valuevar), na.rm=T),
    cell_cluster=.N
  ), by=.(channel,cluster)]
  return(summary_dat)
}

dat2mat <- function(data, formula, valuevar){
  # calculates a matrix suitable for a heatmap from the long data
  hm_dat = dcast.data.table(data =data, formula =formula,
                            value.var = valuevar)
  
  trownames = hm_dat$cluster
  
  # convert to a matrix
  hm_dat = as.matrix(hm_dat[,-1,with=F])
  row.names(hm_dat) = trownames
  
  return(hm_dat)
}

Heatmap of the cluster medians

This reproduces Figure 3C For each phenograph cluster the cluster median is calculated and a heatmap is generated. The all heatmap are clustered based on the uncompensated data in order to make them visually easily comparable. The colorscake corresponds to the asinh(x/5) transformed count and is kept constant for all the plots to make them directly comparable.

firstclust = 1
pdats = dats

for (i in seq_along(pdats)){
  cur_name = names(pdats)[i]
  hm_dat = pdats[[i]][channel %in% good_channels,] %>%
    get_summarydat(clusterdat, 'counts_transf') %>%
    dat2mat('cluster ~ channel','median_val')
  
  if (firstclust){
    firstclust=F
    tdist = as.dist(1-cor(t(hm_dat), method="pearson"))
    tdist[is.na(tdist)] = 0
    hr <- hclust(tdist, method="ward.D")
    co_r <- order.optimal(tdist, hr$merge)
    hr$merge = co_r$merge
    hr$order = co_r$order
    
    tdist = as.dist(1-cor((hm_dat), method="pearson"))
    tdist[is.na(tdist)] = 0
    hc <- hclust(tdist, method="ward.D")
    co_c <- order.optimal(tdist, hc$merge)
    hc$merge = co_c$merge
    hc$order = co_c$order
  }
  if (any(hm_dat<0)){
    cols =  rev(brewer.pal(11,'RdBu'))
    cmap = colorRampPalette(cols)
    
  } else {
    cols =  rev(brewer.pal(11,'RdBu'))
    cmap = colorRampPalette(cols[6:11])
    
  }
  
  heatmap.2(hm_dat,
            scale ='none',
            trace = "none",
            col=cmap(75),
            Rowv=as.dendrogram(hr),
            #RowSideColors=sel_col_vector[group_levels],
            #dendrogram='column',
            Colv=as.dendrogram(hc),
            density.info ='none',
            #Colv =NA,
            #keyorient=2,                                                  
            xlab = 'Conditions',      
            #sepwidth = c(0,0),
            cexRow=0.9,
            cexCol=0.9,
            margins=c(4,4),
            main=cur_name
            #ylab ='Genes',
            #comments = data.frame(names = row.names(tab_Prot))
  )
}

These heatmaps use arcsinh(x/5) scaled data. There is clearly some spillover vanishing upon compensation. After compensation the same antibodies stained in the different channels look much more comparable. With the ‘classical’ compensation a slight overcompensation is visible that is not visible an the NNLS compensation.

Extra figure not used; It would be interesting how these heatmaps looks on a linear scale. Thus instead of arcsinh scaling, each channel is scalled by dividing it through the maximal cluster median of the untransformed data. Thus the scale is linear and comparable between the plots:

firstclust = 1
pdats = dats

for (i in seq_along(pdats)){
  cur_name = names(pdats)[i]
  hm_dat = pdats[[i]][channel %in% good_channels,] %>%
    get_summarydat(clusterdat, 'counts') %>%
    dat2mat('cluster ~ channel','median_val')
  
  if (firstclust){
    normvec = apply(hm_dat,2,function(x) max(abs(x), na.rm = T))
  }
  hm_dat = t(t(hm_dat)/normvec)
  if (firstclust){
    firstclust=F
    print(1)
    tdist = as.dist(1-cor(t(hm_dat), method="pearson"))
    tdist[is.na(tdist)] = 0
    hr <- hclust(tdist, method="ward.D")
    co_r <- order.optimal(tdist, hr$merge)
    hr$merge = co_r$merge
    hr$order = co_r$order
    
    tdist = as.dist(1-cor((hm_dat), method="pearson"))
    tdist[is.na(tdist)] = 0
    hc <- hclust(tdist, method="ward.D")
    co_c <- order.optimal(tdist, hc$merge)
    hc$merge = co_c$merge
    hc$order = co_c$order
  }
  if (any(hm_dat<0, na.rm = T)){
    cols =  rev(brewer.pal(11,'RdBu'))
    cmap = colorRampPalette(cols)
    
  } else {
    cols =  rev(brewer.pal(11,'RdBu'))
    cmap = colorRampPalette(cols[6:11])
    
  }
  
  heatmap.2(hm_dat,
            scale ='none',
            trace = "none",
            col=cmap(75),
            Rowv=as.dendrogram(hr),
            #RowSideColors=sel_col_vector[group_levels],
            #dendrogram='column',
            Colv=as.dendrogram(hc),
            density.info ='none',
            #Colv =NA,
            #keyorient=2,                                                  
            xlab = 'Conditions',      
            #sepwidth = c(0,0),
            cexRow=0.9,
            cexCol=0.9,
            margins=c(4,4),
            main=cur_name
            #ylab ='Genes',
            #comments = data.frame(names = row.names(tab_Prot))
  )
}
## [1] 1

-> Also with a linear scale the spillover is clearly visible and also correctly compensated.

Look at single cell correlation structure within the clusters

Define a helper function to easily calculatge correlation matrices as well as their p-values

get_cormat <- function(data, xcol, ycol, valuecol, method='pearson', pval = F){
  # Helper function to calculate a correlation matrix from the data
  cormat = dcast.data.table(data, paste(xcol, ycol,sep='~'), value.var = valuecol) %>%
    dplyr::select(-matches(xcol)) %>%
    as.matrix() %>%
    rcorr(type=method)
  if (pval == F){
    cormat = cormat$r
  }
  return(cormat)
}

Single cell correlations within clusters

The following section a) calculates the spearman correlation between all markers and their significance within the single cells of each cluser with more than 200 cells.

The correlation heatmaps correspond to Figure 3E of the paper. The heatmaps are clustered according to the correlation structure of the uncompensated data to ensure direct visual comparability. We choose spearman correlation as it is completly independent from the transformation used. Pearson correlation gives similar results however for asinh transformed counts. The p-values heatmap correspond to the log10(p-value) of the corresponding correlation value.

p_co = 0.005
mincells =200
p_clusts = c(12)
doprint = F
corcluster = list()



cols =  rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols)
pdats = dats
clusts = clusterdat[, list(n=.N),by=cluster][n>mincells, unique(cluster)]
for (clust in clusts){
  
  firstclust = 1
  for (i in seq_along(dats)){
    cur_name = paste(c(names(dats)[i], as.character(clust)), collapse = ' ')
    cormat =pdats[[i]][clusterdat[cluster == clust, id], ][channel %in% good_channels,] %>%
      get_cormat(xcol='id', ycol='channel', valuecol='counts_transf',method = 'spearman', pval = T)
    
    
    cormatp = cormat$P
    cormat = cormat$r
    
    
    corcluster[[length(corcluster)+1]] = data.table(cluster=clust,
                                                    comp= names(pdats)[i],
                                                    nsig=sum(cormatp <p_co, na.rm=T)/2,
                                                    nstrong = (sum(abs(cormat)  > 0.5, na.rm=T)-nrow(cormat))/2)
    if ((clust %in% p_clusts) | is.null(p_clusts)){
      if (firstclust){
        firstclust=F
        #print(1)
        tdist = dist(t(cormat), method="euclidean")
        tdist[is.na(tdist)] = 0
        hr <- hclust(tdist, method="ward.D")
        co_r <- order.optimal(tdist, hr$merge)
        hr$merge = co_r$merge
        hr$order = co_r$order
        
        cormatco = cormatp < p_co
      }
      
      
      
      
      heatmap.2(cormat,
                scale ='none',
                trace = "none",
                col=cmap(75),
                symm=T,
                Rowv=as.dendrogram(hr),
                main=paste('Correlation:',cur_name),
                margins = c(10,10)
      )
      
      cmap = colorRampPalette(cols)
      x=-log10(cormatp)
      x[x>5] = 5
      heatmap.2(x,
                scale ='none',
                trace = "none",
                col=cmap(75),
                symm=T,
                Rowv=as.dendrogram(hr),
                Colv=as.dendrogram(hr),
                main=paste('p-values:', cur_name),
                margins = c(10,10)
      )
    }
    
  }
}  

corcluster = rbindlist(corcluster)

Based on the data generated above, the change in number of significant single cell marker correlations per cluster is investigated. This is the basis for figure 3C.

Not shown in the paper is the ‘classical’ compensation, which leads to sometimes quite a significant amount of artifical correlations due to slight overcompen. This is another argument for the NNLS.

corcluster_sel=corcluster
pdat = corcluster_sel[ ,nsig_frac := nsig/nsig[comp == 'raw'] , by=cluster]
pdat[, cluster_p := factor(x=as.character(cluster), levels = as.character(sort(as.numeric(unique(cluster)))))]
pdat%>%
  ggplot(aes(x=comp, y=nsig_frac, color=cluster_p, group=cluster))+
  # quite ugly code to keep the cluster colors consistent
  scale_color_manual(values = col_list[sort(as.numeric(as.character(unique(pdat$cluster_p))))])+
  geom_point(stat='summary', fun.y=sum) +
  stat_summary(fun.y=sum, geom="line")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.background = element_blank(),
        panel.background=element_rect(fill='white', colour = 'black'),
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        plot.background=element_blank(),
        legend.key = element_blank())+
  ggtitle('Figure 3C')+
  expand_limits(y=0)

#ggsave(filename = file.path(folder_out,  'correlation_loss.pdf'),width=3, height=5,useDingbats=FALSE)

Same plot as above, but just showing the raw number of significant correlations.

ggplot(corcluster, aes(x=comp, y=nsig, color=cluster, group=cluster))+
  #facet_wrap(~data, ncol = 1)+
  geom_point(stat='summary', fun.y=sum) +
  stat_summary(fun.y=sum, geom="line")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  expand_limits(y=0)

From here on there is some extra analysis that did not make it to the paper due to space constraints

Correlation of cluster medians

The following heatmap investigates the pearson correlation of the marker medians among the clusters:

firstclust = 1
pdats = dats
  for (i in seq_along(pdats)){
    cur_name = names(pdats)[i]
    cormat = pdats[[i]][channel %in% good_channels,] %>%
      get_summarydat(clusterdat, 'counts') %>%
      get_cormat(xcol='cluster', ycol='channel', valuecol='median_val', method='pearson')
    
    if (firstclust){
      firstclust=F
      print(1)
      tdist = dist(t(cormat), method="euclidean")
      tdist[is.na(tdist)] = 0
      hr <- hclust(tdist, method="ward.D")
      co_r <- order.optimal(tdist, hr$merge)
      hr$merge = co_r$merge
      hr$order = co_r$order
      
    }
    cmap = colorRampPalette(cols)
    heatmap.2(cormat,
              scale ='none',
              trace = "none",
              col=cmap(75),
              symm=T,
              Rowv=as.dendrogram(hr),
              main=cur_name
    )
  }
## [1] 1

-> There is a clear change in correlation, however it is tedious to judge by eye if it makes sense. Thus the section bellow specifically looks into ‘within antibody’ correlation and ‘within metal’ correlations.

Within antibody/metal correlations

In the experiment we have multiple occasions of the same antibody coupled to different metal isotopes. We would expect that upon compensation this correlations are increasing. Further in general correlations within different antibodies using isotopes from the same metal should rather decrease after compensation. However this need not to be the case as the antibodies using the same metal still could be biologically correlated.

helper functions

get_within_between_groupcorr <- function(cormat, grouptab, col_name, col_group){
  # cormat is a named correlation matrix
  # grouptab is a data.table containing groups and names
  # col_name is the name in the grouptab corresponding to the correlation matrix names
  # col_group is the name in the grouptab corresonding to the grouping
  tcormat = cormat
  
  ## Na correlation indicates that a value stays constant. I argue that this is eqivalent to '0' correlation in our case
  # as there would be no relation ship detected
  tcormat[is.na(tcormat)] <- 0
  
  # ignore the diagonal:
  diag(tcormat) <- NA
  
  # create a temporary filter variable
  channnames = colnames(tcormat)
  corgroup_dat = grouptab[, .(fil=list(channnames %in% get(col_name))), by=c(col_group)]
  
  # keep only antibodies with more than one occurences
  corgroup_dat = corgroup_dat[sapply(fil, function(x) sum(x, na.rm = T) >1),]
  # calculate within and between
  corgroup_dat[, ':='(within_cor=mean(abs(tcormat[fil[[1]],fil[[1]]]), na.rm = T)
  ),by=c(col_group)]
  
  # delete the filter variable
  corgroup_dat[, fil:=NULL]
  return(corgroup_dat)
}

# define the 'fisher transform' scale: this is the variance stabilizing transform for correlations
# -> I think that might be appropriate to use here
scale_atanh = scales::trans_new('arctanh', atanh, tanh, breaks =scales::pretty_breaks() )

Do all the calculations:

tcorr_tabs_antibody = list()
tcorr_tabs_metal = list()
pdats = dats

for (i in seq_along(pdats)){
  cur_name = names(pdats)[i]
  cormat = pdats[[i]][channel %in% good_channels,] %>%
    get_summarydat(clusterdat, 'counts') %>%
    get_cormat(xcol='cluster', ycol='channel', valuecol='median_val')
  # within antibodies
  tcorr_tab = get_within_between_groupcorr(cormat = cormat, grouptab = datmeta, col_name = 'channel',col_group = 'antibody')
  tcorr_tab[, dataset:=cur_name]
  tcorr_tabs_antibody = append(tcorr_tabs_antibody,list(tcorr_tab) )
  
  # for within metals
  tcorr_tab = get_within_between_groupcorr(cormat = abs(cormat), grouptab = datmeta, col_name = 'channel',col_group = 'metal')
  tcorr_tab[, dataset:=cur_name]
  tcorr_tabs_metal = append(tcorr_tabs_metal,list(tcorr_tab) )
}

corr_tab_antibody = data.table::rbindlist(tcorr_tabs_antibody)
corr_tab_metal= data.table::rbindlist(tcorr_tabs_metal)

-> This plot looks at the pearson correlation between the median counts of the Phenograph clusters before compensation (raw) and afterwards.

ggplot(corr_tab_antibody, aes(x=dataset, y=within_cor, color=as.factor(antibody), group=antibody))+
  scale_y_continuous(trans = scale_atanh, breaks = c(0.5,0.8,0.90,0.975,0.99,0.999, 0.99999))+
  expand_limits(y=0.99)+
  geom_point()+
  geom_line()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

-> As expected and visible in the heatmaps, the correlation between same antibodies labeled with different metals increases after compensation (in particular CD20: 0.5 -> 0.9)

sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.5 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/lapack/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] mclust_5.3          Hmisc_4.0-3         Formula_1.2-2      
##  [4] survival_2.41-3     lattice_0.20-35     cba_0.2-19         
##  [7] proxy_0.4-17        gplots_3.0.1        RColorBrewer_1.1-2 
## [10] dtplyr_0.0.2        data.table_1.10.4-1 dplyr_0.7.4        
## [13] CATALYST_1.1.5      Rphenograph_0.99.1  igraph_1.1.2       
## [16] ggplot2_2.2.1       Rtsne_0.13          flowCore_1.42.3    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-131        bitops_1.0-6        matrixStats_0.53.0 
##  [4] pbkrtest_0.4-7      httr_1.3.1          rprojroot_1.2      
##  [7] tools_3.4.1         backports_1.1.1     R6_2.2.2           
## [10] rpart_4.1-11        KernSmooth_2.23-15  lazyeval_0.2.0     
## [13] BiocGenerics_0.22.1 mgcv_1.8-22         colorspace_1.3-2   
## [16] nnet_7.3-12         gridExtra_2.3       compiler_3.4.1     
## [19] graph_1.54.0        quantreg_5.33       Biobase_2.36.2     
## [22] htmlTable_1.9       SparseM_1.77        plotly_4.7.1       
## [25] sandwich_2.4-0      labeling_0.3        checkmate_1.8.4    
## [28] caTools_1.17.1      scales_0.5.0        DEoptimR_1.0-8     
## [31] nnls_1.4            mvtnorm_1.0-6       robustbase_0.92-7  
## [34] stringr_1.2.0       digest_0.6.15       foreign_0.8-69     
## [37] minqa_1.2.4         rmarkdown_1.6       base64enc_0.1-3    
## [40] rrcov_1.4-3         pkgconfig_2.0.1     htmltools_0.3.6    
## [43] lme4_1.1-14         plotrix_3.7         htmlwidgets_1.0    
## [46] rlang_0.1.2         shiny_1.0.5         bindr_0.1          
## [49] zoo_1.8-0           jsonlite_1.5        gtools_3.5.0       
## [52] acepack_1.4.1       car_2.1-5           magrittr_1.5       
## [55] Matrix_1.2-11       Rcpp_0.12.15        munsell_0.4.3      
## [58] stringi_1.1.5       multcomp_1.4-8      yaml_2.1.16        
## [61] MASS_7.3-47         plyr_1.8.4          gdata_2.18.0       
## [64] parallel_3.4.1      splines_3.4.1       knitr_1.17         
## [67] corpcor_1.6.9       reshape2_1.4.3      codetools_0.2-15   
## [70] stats4_3.4.1        glue_1.1.1          drc_3.0-1          
## [73] evaluate_0.10.1     latticeExtra_0.6-28 nloptr_1.0.4       
## [76] httpuv_1.3.5        MatrixModels_0.4-1  gtable_0.2.0       
## [79] RANN_2.5.1          purrr_0.2.3         tidyr_0.7.1        
## [82] assertthat_0.2.0    mime_0.5            xtable_1.8-2       
## [85] largeVis_0.2.1      pcaPP_1.9-72        viridisLite_0.2.0  
## [88] tibble_1.3.4        bindrcpp_0.2        cluster_2.0.6      
## [91] TH.data_1.0-8